package org.commoncrawl.tools;

import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.util.Progressable;
import org.commoncrawl.util.CCStringUtils;

import com.amazonaws.AmazonServiceException;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.google.common.io.CountingInputStream;

/**
 * Hacked together utility to break up the bulk Blekko URL list file into
 * smaller SequenceFile chunks and push them up to S3.
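 * <p>
 * The input is expected to be a gzipped text file with one
 * "url&lt;space&gt;metadata" pair per line. Example invocation (all values
 * below are illustrative placeholders, not real credentials, buckets, or
 * paths):
 * <pre>
 *   java org.commoncrawl.tools.BlekkoURLListTransfer \
 *     -awsKey ACCESS_KEY -awsSecret SECRET_KEY \
 *     -s3bucket some-bucket -s3path blekko/urls/ \
 *     -input /data/blekko-url-list.gz
 * </pre>
 * Since the s3path prefix is concatenated directly with each chunk file name,
 * it should normally end in a '/'.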
 *
 * @author rana
 */
public class BlekkoURLListTransfer {

  public static final Log LOG = LogFactory.getLog(BlekkoURLListTransfer.class);

  static Options options = new Options();

  static {
    options.addOption(OptionBuilder.withArgName("awsKey").hasArg().withDescription("AWS Key").isRequired()
        .create("awsKey"));
    options.addOption(OptionBuilder.withArgName("awsSecret").hasArg().withDescription("AWS Secret").isRequired()
        .create("awsSecret"));
    options.addOption(OptionBuilder.withArgName("s3bucket").hasArg().withDescription("S3 bucket name").isRequired()
        .create("s3bucket"));
    options.addOption(OptionBuilder.withArgName("s3path").hasArg().withDescription("S3 path prefix").isRequired()
        .create("s3path"));
    options.addOption(OptionBuilder.withArgName("input").hasArg().withDescription("Input URL List").isRequired()
        .create("input"));
  }

  static void printUsage() {
    HelpFormatter formatter = new HelpFormatter();
    formatter.printHelp("BlekkoURLListTransfer", options);
  }

  private static final String IN_MEMORY_FS_URI = "imfs://localhost/";
  private static final int HOLDING_BUFFER_SIZE = 1 << 16;
  private static final int SCAN_BUFFER_SIZE = HOLDING_BUFFER_SIZE / 4;
  private static final int CANNED_FILE_COMPRESSED_BLOCK_SIZE = 1 << 20; // 1 MB
  private static final int CANNED_FILE_SIZE = CANNED_FILE_COMPRESSED_BLOCK_SIZE * 100;
  private static final int CANNED_FILE_SIZE_PAD = CANNED_FILE_COMPRESSED_BLOCK_SIZE * 10;
  private static final Path CANNED_FILE_PATH = new Path("/tmp/cannedFile");
  private static final String COMPLETION_FILE_SUFFIX = "COMPLETE";

  /**
   * Minimal in-memory FileSystem used to stage a single SequenceFile in RAM
   * before it is shipped to S3.
   */
  private static class InMemoryFSHack extends FileSystem {

    static class CustomInputStream extends DataInputBuffer implements PositionedReadable, Seekable {

      int offset;
      int len;

      public CustomInputStream(byte[] data, int offset, int length) {
        this.offset = offset;
        this.len = length;
        super.reset(data, offset, length);
      }

      @Override
      public void readFully(long position, byte[] buffer) throws IOException {
        read(position, buffer, 0, buffer.length);
      }

      @Override
      public void readFully(long position, byte[] buffer, int offset, int length) throws IOException {
        read(position, buffer, offset, length);
      }

      @Override
      public int read(long position, byte[] buffer, int offset, int length) throws IOException {
        // honor the caller supplied offset/length (the original always copied
        // buffer.length bytes to offset zero)
        System.arraycopy(this.getData(), (int) position, buffer, offset, length);
        return length;
      }

      @Override
      public void seek(long pos) throws IOException {
        super.reset(getData(), offset + (int) pos, len - (int) pos);
      }

      @Override
      public long getPos() throws IOException {
        return super.getPosition();
      }

      @Override
      public boolean seekToNewSource(long targetPos) throws IOException {
        return false;
      }
    }

    InMemoryFSHack(Configuration conf) {
      setConf(conf);
    }

    DataOutputBuffer outputStream = new DataOutputBuffer(CANNED_FILE_SIZE + CANNED_FILE_SIZE_PAD);

    @Override
    public URI getUri() {
      try {
        return new URI(IN_MEMORY_FS_URI);
      } catch (URISyntaxException e) {
        return null;
      }
    }

    @Override
    public FSDataInputStream open(Path f, int bufferSize) throws IOException {
      CustomInputStream inputStream = new CustomInputStream(outputStream.getData(), 0, outputStream.getLength());
      return new FSDataInputStream(inputStream);
    }

    @Override
    public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize,
        short replication, long blockSize, Progressable progress) throws IOException {
      outputStream.reset();
      return new FSDataOutputStream(outputStream, null);
    }
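    // the remaining FileSystem methods are stubs: this "file system" only ever
    // holds a single write-once file, so append/rename/delete-by-path/list and
    // mkdirs are never exercised by the SequenceFile code paths used here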
    @Override
    public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException {
      return null;
    }

    @Override
    public boolean rename(Path src, Path dst) throws IOException {
      return false;
    }

    @Override
    @Deprecated
    public boolean delete(Path f) throws IOException {
      return false;
    }

    @Override
    public boolean delete(Path f, boolean recursive) throws IOException {
      outputStream.reset();
      return true;
    }

    /** detach the current buffer and replace it with a fresh one */
    public DataOutputBuffer swapBuffers() {
      DataOutputBuffer out = outputStream;
      outputStream = new DataOutputBuffer(CANNED_FILE_SIZE + CANNED_FILE_SIZE_PAD);
      return out;
    }

    @Override
    public FileStatus[] listStatus(Path f) throws IOException {
      return null;
    }

    @Override
    public void setWorkingDirectory(Path new_dir) {
    }

    @Override
    public Path getWorkingDirectory() {
      return null;
    }

    @Override
    public boolean mkdirs(Path f, FsPermission permission) throws IOException {
      return false;
    }

    @Override
    public FileStatus getFileStatus(Path f) throws IOException {
      return new FileStatus(outputStream.getLength(), false, 1, 1, 1, CANNED_FILE_PATH);
    }
  }

  /**
   * Read the next line from the decompressed stream, split it at the first
   * space into url/metadata, and append the pair to the SequenceFile writer.
   *
   * @return the number of decompressed bytes consumed so far
   */
  private static long readWriteNextLine(CountingInputStream is, ByteBuffer inputBuffer,
      DataOutputBuffer outputBuffer, SequenceFile.Writer writer) throws IOException {
    outputBuffer.reset();
    for (;;) {
      if (inputBuffer.remaining() == 0) {
        int bytesRead = is.read(inputBuffer.array());
        if (bytesRead == -1) {
          throw new EOFException();
        } else {
          inputBuffer.clear();
          inputBuffer.limit(bytesRead);
        }
      }
      int scanStartPos = inputBuffer.position();
      boolean eos = false;
      while (inputBuffer.remaining() != 0) {
        byte nextChar = inputBuffer.get();
        if ((nextChar == '\n') || (nextChar == '\r')) {
          eos = true;
          break;
        }
      }
      // put whatever we read into the output buffer, excluding the line
      // delimiter itself (the original kept the trailing newline, which ended
      // up embedded in the metadata value)
      int scannedBytes = inputBuffer.position() - scanStartPos;
      outputBuffer.write(inputBuffer.array(), scanStartPos, eos ? scannedBytes - 1 : scannedBytes);
      if (eos) {
        break;
      }
    }

    String line = new String(outputBuffer.getData(), 0, outputBuffer.getLength(), Charset.forName("UTF-8"));
    int spaceDelimiter = line.indexOf(' ');
    if (spaceDelimiter != -1 && spaceDelimiter < line.length() - 1) {
      String url = line.substring(0, spaceDelimiter);
      String metadata = line.substring(spaceDelimiter + 1);
      if (url.length() != 0 && metadata.length() != 0) {
        writer.append(new Text(url), new Text(metadata));
      }
    }
    // consumed == total bytes pulled off the stream minus whatever is still
    // sitting unread in the scan buffer (the original added the buffer
    // position, which double counted already consumed bytes)
    return is.getCount() - inputBuffer.remaining();
  }

  private static boolean scanForCompletionFile(AmazonS3Client s3Client, String s3Bucket, String s3Path)
      throws IOException {
    String finalPath = s3Path + COMPLETION_FILE_SUFFIX;
    try {
      s3Client.getObjectMetadata(s3Bucket, finalPath);
      return true;
    } catch (AmazonServiceException e) {
      if (e.getStatusCode() == 404) {
        return false;
      } else {
        throw new IOException(e);
      }
    }
  }

  private static Pattern seqFilePattern = Pattern.compile(".*/([0-9]*)\\.seq");

  private static SequenceFile.Writer flushFile(InMemoryFSHack fs, Configuration conf, Uploader uploader,
      String s3Bucket, String s3FolderPath, long lastValidReadPos, SequenceFile.Writer writer) throws IOException {
    writer.close();

    String fullS3Path = s3FolderPath + Long.toString(lastValidReadPos) + ".seq";
    // ok, detach the buffer (note: getFileStatus would report the fresh, empty
    // buffer after the swap, so log the detached buffer's length instead)
    DataOutputBuffer bufferOut = fs.swapBuffers();
    DataInputBuffer inputStream = new DataInputBuffer();
    inputStream.reset(bufferOut.getData(), 0, bufferOut.getLength());
    QueueItem queueItem = new QueueItem(s3Bucket, fullS3Path, inputStream);
    LOG.info("Queueing for Upload File:" + fullS3Path + " of size:" + bufferOut.getLength() + " to S3");
    try {
      uploader.queue.put(queueItem);
    } catch (InterruptedException e) {
    }
    LOG.info("Queued for Upload File:" + fullS3Path + " of size:" + bufferOut.getLength() + " to S3");
    return createWriter(fs, conf);
  }
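  /**
   * Each uploaded chunk is named for the decompressed-stream offset at which it
   * was cut (e.g. "104857600.seq"), so the highest such number under the prefix
   * tells us where a restarted run should resume.
   */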
  private static long scanForLastValidOffset(AmazonS3Client s3Client, String s3Bucket, String s3Path)
      throws IOException {
    ObjectListing listing = s3Client.listObjects(s3Bucket, s3Path);
    boolean done = false;
    long lastValidOffsetOut = 0L;
    do {
      for (S3ObjectSummary summary : listing.getObjectSummaries()) {
        Matcher seqFileMatcher = seqFilePattern.matcher(summary.getKey());
        if (seqFileMatcher.matches()) {
          lastValidOffsetOut = Math.max(lastValidOffsetOut, Long.parseLong(seqFileMatcher.group(1)));
        }
      }
      if (listing.isTruncated()) {
        listing = s3Client.listNextBatchOfObjects(listing);
      } else {
        done = true;
      }
    } while (!done);
    return lastValidOffsetOut;
  }

  private static SequenceFile.Writer createWriter(FileSystem fs, Configuration conf) throws IOException {
    return SequenceFile.createWriter(fs, conf, CANNED_FILE_PATH, Text.class, Text.class, CompressionType.BLOCK,
        new SnappyCodec());
  }

  public static class QueueItem {

    String bucket;
    String path;
    DataInputBuffer payload;

    public QueueItem() {
    }

    public QueueItem(String bucket, String path, DataInputBuffer payload) {
      this.bucket = bucket;
      this.path = path;
      this.payload = payload;
    }
  }

  public static class Uploader {

    static final int MAX_BACKLOG_SIZE = 15;
    static final int UPLOADER_THREAD_COUNT = 10;

    LinkedBlockingQueue<QueueItem> queue = new LinkedBlockingQueue<QueueItem>(MAX_BACKLOG_SIZE);
    Thread threads[] = new Thread[UPLOADER_THREAD_COUNT];
    AmazonS3Client s3Client;
    // starts negative so that acquire() only succeeds once all
    // UPLOADER_THREAD_COUNT threads have called release()
    Semaphore runningWaitSemaphore = new Semaphore(-(UPLOADER_THREAD_COUNT - 1));

    public Uploader(String awsAccessKey, String awsSecret) throws IOException {
      BasicAWSCredentials credentials = new BasicAWSCredentials(awsAccessKey, awsSecret);
      // create the client ...
      s3Client = new AmazonS3Client(credentials);

      for (int i = 0; i < UPLOADER_THREAD_COUNT; ++i) {
        // capture the thread index for the closure ...
        final int threadIndex = i;
        threads[threadIndex] = new Thread(new Runnable() {

          @Override
          public void run() {
            try {
              while (true) {
                try {
                  QueueItem item = queue.take();
                  if (item.payload == null) {
                    LOG.info("UPLOADER_THREAD[" + threadIndex + "]: Received NULL Queue Item. Exiting");
                    break;
                  } else {
                    boolean done = false;
                    int retryCount = 0;
                    while (!done) {
                      try {
                        long flushStartTime = System.currentTimeMillis();
                        ObjectMetadata metadata = new ObjectMetadata();
                        metadata.setContentLength(item.payload.getLength());
                        // rewind the payload in case a failed attempt already
                        // consumed part of the stream
                        item.payload.reset(item.payload.getData(), item.payload.getLength());
                        s3Client.putObject(item.bucket, item.path, item.payload, metadata);
                        long flushEndTime = System.currentTimeMillis();
                        LOG.info("UPLOADER_THREAD[" + threadIndex + "]: Flushing Finished for File:" + item.path
                            + " of size:" + item.payload.getLength() + " Took:" + (flushEndTime - flushStartTime));
                        done = true;
                      } catch (Exception e) {
                        LOG.error("UPLOADER_THREAD[" + threadIndex + "]: Exception While Flushing File:" + item.path
                            + " of size:" + item.payload.getLength() + " Exception:"
                            + CCStringUtils.stringifyException(e) + " RetryCount:" + retryCount);
                        ++retryCount;
                      }
                    }
                  }
                } catch (InterruptedException e) {
                }
              }
              LOG.info("UPLOADER_THREAD[" + threadIndex + "]: DONE");
            } finally {
              runningWaitSemaphore.release();
            }
          }
        });
        threads[threadIndex].start();
      }
    }
  }
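  /**
   * Flow: parse arguments; exit early if a COMPLETE marker already exists under
   * the S3 prefix; otherwise resume from the highest previously uploaded
   * offset, stream-decompress the input, pack url/metadata lines into in-memory
   * SequenceFiles of roughly CANNED_FILE_SIZE bytes, and hand each finished
   * chunk to the Uploader thread pool.
   */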
  public static void main(String[] args) {
    CommandLineParser parser = new GnuParser();
    try {
      // parse the command line arguments
      CommandLine cmdLine = parser.parse(options, args);

      BasicAWSCredentials credentials = new BasicAWSCredentials(cmdLine.getOptionValue("awsKey"),
          cmdLine.getOptionValue("awsSecret"));
      // create the client ...
      AmazonS3Client s3Client = new AmazonS3Client(credentials);
      // create the uploader thread pool ...
      Uploader uploader = new Uploader(cmdLine.getOptionValue("awsKey"), cmdLine.getOptionValue("awsSecret"));

      // the (gzipped) input file ...
      File inputFile = new File(cmdLine.getOptionValue("input"));

      // allocate in memory file system
      Configuration conf = new Configuration();
      conf.setInt("io.seqfile.compress.blocksize", CANNED_FILE_COMPRESSED_BLOCK_SIZE);
      InMemoryFSHack fsHack = new InMemoryFSHack(conf);

      DataOutputBuffer outputBuffer = new DataOutputBuffer(HOLDING_BUFFER_SIZE);

      // get bucket and path parameters
      String s3bucket = cmdLine.getOptionValue("s3bucket");
      String s3path = cmdLine.getOptionValue("s3path");

      // scan for completion marker ...
      if (scanForCompletionFile(s3Client, s3bucket, s3path) == false) {
        // scan existing files to find last decompressed offset ...
        long lastReadPos = scanForLastValidOffset(s3Client, s3bucket, s3path);
        LOG.info("Last Valid Read Pos:" + lastReadPos);

        // open input stream ...
        CountingInputStream countingInputStream = new CountingInputStream(new FileInputStream(inputFile));
        // setup inflater ...
        LOG.info("Initializing GZIP Stream for File at:" + inputFile);
        GZIPInputStream inflater = new GZIPInputStream(countingInputStream, SCAN_BUFFER_SIZE);
        // init counting stream to wrap inflater
        CountingInputStream countingDecompressedStream = new CountingInputStream(inflater);

        // skip to the last scan offset; skip on the counting stream so the
        // resume offset accounting stays consistent, and loop, since skip()
        // may skip fewer bytes than requested
        long toSkip = lastReadPos;
        while (toSkip > 0) {
          long skipped = countingDecompressedStream.skip(toSkip);
          if (skipped <= 0) {
            throw new IOException("Failed to skip to resume offset:" + lastReadPos);
          }
          toSkip -= skipped;
        }

        ByteBuffer scanBuffer = ByteBuffer.allocate(SCAN_BUFFER_SIZE);
        boolean eof = false;

        // read input file, collecting lines into buffer ...
        long lineCount = 0;

        // create sequence file ...
        SequenceFile.Writer writer = createWriter(fsHack, conf);

        while (!eof) {
          try {
            lastReadPos = readWriteNextLine(countingDecompressedStream, scanBuffer, outputBuffer, writer);
            ++lineCount;
            if (lineCount % 10000 == 0) {
              LOG.info("Read 10000 lines RAW Pos:" + countingInputStream.getCount() + " lastReadPos:" + lastReadPos
                  + " TotalLines:" + lineCount);
            }
          } catch (EOFException e) {
            LOG.info("HIT EOF AT Raw Pos:" + countingInputStream.getCount() + " lastReadPos:" + lastReadPos);
            eof = true;
          }
          // once our buffer flush threshold is hit, or at eof ...
          if (eof || writer.getLength() >= CANNED_FILE_SIZE) {
            // flush buffer to s3
            writer = flushFile(fsHack, conf, uploader, s3bucket, s3path, lastReadPos, writer);
            // reset output buffer
            outputBuffer.reset();
          }
        }

        LOG.info("Done Processing Data. Queueing Empty Items");
        // one empty (poison pill) item per uploader thread; a single pill
        // would only shut down one of the UPLOADER_THREAD_COUNT threads
        for (int i = 0; i < Uploader.UPLOADER_THREAD_COUNT; ++i) {
          uploader.queue.put(new QueueItem());
        }
        LOG.info("Waiting for Uploader Threads to Die");
        uploader.runningWaitSemaphore.acquireUninterruptibly();
        LOG.info("Uploader Threads Dead. Exiting");
      }
    } catch (ParseException e) {
      System.out.println("Error parsing command line:" + e.getMessage());
      printUsage();
    } catch (Exception exp) {
      // oops, something went wrong
      LOG.error(CCStringUtils.stringifyException(exp));
      printUsage();
    }
  }
}